{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "# Build Network\n",
    "With NetSim, I have a tool that is capable of simulating a network. with implanted subnetworks.\n",
    "Now, I want to use it already generated networks to build the input for training datafor a GCN.\n",
    "\n",
    "This notebook will take a network and some RNA-seq counts to form a data set that can serve as input to a GCN model.\n",
    "What it does:\n",
    "1. Loading network, RNA-seq counts & differential expression\n",
    "2. Using the insert positions from the network to assign the insert nodes with the most differentially expressed feature vectors\n",
    "3. Form a h5py file that can serve as input for the GCN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import networkx as nx\n",
    "import numpy as np\n",
    "import h5py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# params\n",
    "BALANCE = False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Loading the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# read the network and insert positions\n",
    "network = nx.read_edgelist('../data/simulation/network.edgelist')\n",
    "insert_positions = []\n",
    "with open('../data/simulation/implant_positions.txt', 'r') as f:\n",
    "    for line in f.readlines():\n",
    "        if line.startswith('#'): # comment\n",
    "            pass\n",
    "        elif line.startswith('Subnetwork'):\n",
    "            positions = line.split(':')[1].strip().split('\\t')\n",
    "            insert_positions.append([int(i) for i in positions])\n",
    "\n",
    "# read differential expression (to get log2FoldChange as ranking)\n",
    "de = pd.DataFrame.from_csv('../data/differential_expression/deseq2_gfppT16_vs_ControlT16.csv')\n",
    "de.dropna(axis=0, inplace=True)\n",
    "\n",
    "# read features (RNA-seq counts) & train/test masks\n",
    "data_file = '../data/preprocessing/legionella_gcn_input.h5'\n",
    "with h5py.File(data_file, 'r') as f:\n",
    "    features = f['features'][:]\n",
    "    node_names = f['gene_names'][:]\n",
    "    y_train = f['y_train'][:]\n",
    "    y_test = f['y_test'][:]\n",
    "    if 'y_val' in f:\n",
    "        y_val = f['y_val'][:]\n",
    "    else:\n",
    "        y_val = None\n",
    "    train_mask = f['mask_train'][:]\n",
    "    test_mask = f['mask_test'][:]\n",
    "    if 'mask_val' in f:\n",
    "        val_mask = f['mask_val'][:]\n",
    "    else:\n",
    "        val_mask = None\n",
    "features_df = pd.DataFrame(features, index=node_names[:, 0])\n",
    "# build label column for features\n",
    "features_df['label'] = y_train[:, 0] | y_test[:, 0]\n",
    "assert (features_df.label.sum() == len(insert_positions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Assign Real Features to Simulated Nodes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# assign disease genes to first nodes of subnetworks\n",
    "features_df['node'] = np.nan\n",
    "first_nodes = [i[0] for i in insert_positions]\n",
    "features_df.loc[features_df.label == 1, 'node'] = first_nodes\n",
    "# for each of the other nodes, assign the gene with highest differential expression\n",
    "# first, remove disease genes from DE\n",
    "labels = features_df[features_df.label == 1].index\n",
    "diff_expr = de[de.index.isin(features_df.index) & ~de.index.isin(labels)]\n",
    "sorted_de = diff_expr.sort_values(by='log2FoldChange', ascending=False)\n",
    "\n",
    "# Now, use the DE log2FoldChange to find genes for neighbors of disease genes\n",
    "idx = 0\n",
    "assigned_nodes = list(features_df[~features_df.node.isnull()].node.values)\n",
    "assert (len(assigned_nodes) == len(set(assigned_nodes))) # no duplicates here\n",
    "for node_position in range(1, len(insert_positions[0])):\n",
    "    for subnet in range(len(insert_positions)):\n",
    "        ens_id = sorted_de.iloc[[idx]].index\n",
    "        features_df.loc[ens_id, 'node'] = insert_positions[subnet][node_position]\n",
    "        assigned_nodes.append(insert_positions[subnet][node_position])\n",
    "        idx += 1\n",
    "assert (len(assigned_nodes) == len(set(assigned_nodes)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "# Now, randomly assign features to the other nodes\n",
    "features_not_assigned = features_df[features_df.node.isnull()]\n",
    "already_assigned = np.array(insert_positions).flatten()\n",
    "node_assignments = np.random.choice(list(network.nodes()),\n",
    "                                    size=nx.number_of_nodes(network),\n",
    "                                    replace=False)\n",
    "node_assignments = [int(i) for i in node_assignments if not int(i) in already_assigned]\n",
    "to_assign = features_df[features_df.node.isnull()].sample(n=len(node_assignments), replace=False).index\n",
    "features_df.loc[features_df.index.isin(to_assign), 'node'] = node_assignments\n",
    "assigned_features = features_df[~features_df.node.isnull()]\n",
    "features_df[~features_df.node.isnull()].shape\n",
    "\n",
    "# finally, remove remaining features\n",
    "final_features = features_df.dropna(axis=0)\n",
    "final_features = final_features.set_index('node')\n",
    "#final_features = final_features.drop('label', axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Compute Labels\n",
    "As a final step, I have to compute training and test splits for the nodes. Then, I have to construct labels for training and testing set and masks for both."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true,
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "737 316\n"
     ]
    }
   ],
   "source": [
    "# construct y and masks for training and testing\n",
    "labels = pd.get_dummies(final_features.label)\n",
    "labels.columns = ['Non_Label', 'Label']\n",
    "pos_train = labels[labels.Label == 1].sample(frac=0.7, replace=False)\n",
    "if BALANCE:\n",
    "    neg_train = labels[labels.Non_Label == 1].sample(pos_train.shape[0], replace=False)\n",
    "else:\n",
    "    neg_train = labels[labels.Non_Label == 1].sample(frac=0.7, replace=False)\n",
    "train = pd.concat([pos_train, neg_train])\n",
    "assert (pos_train.index.isin(final_features[final_features.label == 1].index).all())\n",
    "assert (neg_train.index.isin(final_features[final_features.label == 0].index).all())\n",
    "train_mask = final_features.index.isin(train.index)\n",
    "test_mask = ~final_features.index.isin(train.index)\n",
    "print (train_mask.sum(axis=0), test_mask.sum(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "train_nodes = train[train.Label == 1]\n",
    "y_train = np.logical_and(final_features.label == 1, final_features.index.isin(train.index))\n",
    "y_train = np.logical_not(pd.get_dummies(y_train).values)\n",
    "\n",
    "test_nodes = labels[~labels.index.isin(pos_train.index)].index\n",
    "pos_test_nodes =  final_features[final_features.index.isin(test_nodes) & final_features.label == 1]\n",
    "y_test = pd.get_dummies(~final_features.index.isin(pos_test_nodes.index)).values\n",
    "y_test.shape, y_test.sum(axis=0)\n",
    "assert (final_features[np.logical_and(train_mask == 1, y_train[:, 0] == 1)].index.isin(train.index).all())\n",
    "assert (final_features[np.logical_and(test_mask == 1, y_test[:, 0] == 1)].index.isin(test_nodes).all())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1053, 2) (1053, 2)\n",
      "(1053,) (1053,)\n",
      "(1053, 25)\n"
     ]
    }
   ],
   "source": [
    "print (y_train.shape, y_test.shape)\n",
    "print (train_mask.shape, test_mask.shape)\n",
    "print (final_features.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "deletable": true,
    "editable": true
   },
   "source": [
    "## Write To Container"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
    "y_train = np.array([y_train[:, 0]]).T\n",
    "y_test = np.array([y_test[:, 0]]).T"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Container written to ../data/simulation/simulated_input_legionella_unbalanced.h5\n"
     ]
    }
   ],
   "source": [
    "network_np = nx.to_pandas_dataframe(network)\n",
    "node_names = final_features.index.values\n",
    "node_names = np.vstack([node_names, final_features.index.values]).transpose(1, 0) # just stack node numbers\n",
    "real_disease_genes = np.array([i[0] for i in insert_positions])\n",
    "final_features.drop('label', axis=1, inplace=True)\n",
    "\n",
    "if BALANCE:\n",
    "    fname = '../data/simulation/simulated_input_legionella_balanced.h5'\n",
    "else:\n",
    "    fname = '../data/simulation/simulated_input_legionella_unbalanced.h5'\n",
    "f = h5py.File(fname, 'w')\n",
    "f.create_dataset('network', data=network_np, shape=network_np.shape)\n",
    "f.create_dataset('features', data=final_features, shape=final_features.shape)\n",
    "f.create_dataset('gene_names', data=node_names, shape=node_names.shape)\n",
    "f.create_dataset('real_disease_genes', data=real_disease_genes, shape=real_disease_genes.shape)\n",
    "\n",
    "f.create_dataset('y_train', data=y_train, shape=y_train.shape)\n",
    "f.create_dataset('y_test', data=y_test, shape=y_test.shape)\n",
    "f.create_dataset('mask_train', data=train_mask, shape=train_mask.shape)\n",
    "f.create_dataset('mask_test', data=test_mask, shape=test_mask.shape)\n",
    "f.close()\n",
    "print (\"Container written to {}\".format(fname))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "270"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(y_train[:,0]*10).sum(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1053, 2) [  11 1042]\n",
      "(1053, 2) [  27 1026]\n",
      "(1053,) 737\n",
      "(1053,) 316\n"
     ]
    }
   ],
   "source": [
    "with h5py.File('../data/simulation/simulated_input_legionella_unbalanced.h5', 'r') as f:\n",
    "    network = f['network'][:]\n",
    "    features = f['features'][:]\n",
    "    node_names = f['gene_names'][:]\n",
    "    y_train = f['y_train'][:]\n",
    "    y_test = f['y_test'][:]\n",
    "    if 'y_val' in f:\n",
    "        y_val = f['y_val'][:]\n",
    "    else:\n",
    "        y_val = None\n",
    "    train_mask = f['mask_train'][:]\n",
    "    test_mask = f['mask_test'][:]\n",
    "    if 'mask_val' in f:\n",
    "        val_mask = f['mask_val'][:]\n",
    "    else:\n",
    "        val_mask = None\n",
    "print (y_test.shape, y_test.sum(axis=0))\n",
    "print (y_train.shape, y_train.sum(axis=0))\n",
    "print (train_mask.shape, train_mask.sum(axis=0))\n",
    "print (test_mask.shape, test_mask.sum(axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1053, 2)\n"
     ]
    }
   ],
   "source": [
    "print (node_names.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
